# Display a remote banner image (Airbnb guide graphic) at the top of the notebook.
from IPython.display import Image
Image(url='https://img4.cityrealty.com/neo/i/p/mig/airbnb_guide.jpg')
We focus on California’s data for this analysis. Being new to California, we were interested in learning which places people prefer for Airbnb stays in the state. Our main focus is affordability, customer preference for Airbnb properties based on customer reviews, and how close these properties are to airports, beaches, and other tourist locations.
https://www.kaggle.com/gkdoad/airbnb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import json
import branca
## Show up to 50 columns when displaying DataFrames.
pd.set_option('display.max_columns', 50)
## NOTE(review): the original `pd.options.display.max_rows` line only *read*
## the option without assigning it — a no-op — so it has been removed.
## Set a value here if a custom row limit was actually intended.
# Setting a base map
# Approximate latitude/longitude of the Los Angeles area, intended as a
# map centre.
# NOTE(review): these module-level constants are not referenced anywhere in
# the visible code (the map below centres on column means instead) —
# confirm they are used elsewhere before removing.
lat = 34.1000
long = -118.4662
## Load the raw listings file, using the scraped Airbnb id as the index.
initFile = pd.read_csv('listings.csv', index_col='id', low_memory=False)
initFile.head(1)
initFile.first_review.count()
initFile.first_review.sort_values(ascending=True).head(1)
initFile.head()
## Replace the scraped Airbnb ids with a clean 1..n surrogate key named 'id'.
## The original reset_index/colid/drop/rename/+1/set_index sequence is
## equivalent to this single index assignment (the source ids are discarded
## either way).
initFile.index = pd.RangeIndex(start=1, stop=len(initFile) + 1, name='id')
initFile.head(1)
df_cleansed = initFile.copy()
### Taking only the CA listings and making sure everything in state as uppercase
## BUG FIX: use df.loc[mask, col] instead of the chained
## `df.state.loc[mask] = ...`, which writes through a temporary Series and
## is not guaranteed to update the DataFrame (broken under pandas
## copy-on-write).
df_cleansed.loc[df_cleansed.state == 'California', 'state'] = 'CA'
df_cleansed.state.unique()
df_cleansed.state = df_cleansed['state'].str.upper()
## As we can see there are some unwanted & non-CA entries in the state column
df_cleansed.state.value_counts()
df_cleansed.drop(df_cleansed[~(df_cleansed.state.values=='CA')].index, axis=0, inplace=True)
df_cleansed.city = df_cleansed.city.str.lower()
df_cleansed.city.unique()
## BUG FIX: `Series.dropna(inplace=True)` on a column cannot remove rows
## from the DataFrame (it was a silent no-op); drop NaN-city rows at the
## frame level instead.
df_cleansed.dropna(subset=['city'], inplace=True)
## Strip non-ASCII characters from city names. regex=True must now be
## explicit: pandas >= 2.0 treats str.replace patterns as literal text by
## default, which would have made this a no-op.
df_cleansed['city'] = df_cleansed['city'].str.replace(r'[^\x00-\x7F]+', '', regex=True)
df_cleansed.city.value_counts()
df_cleansed.groupby('city')['neighbourhood_cleansed'].value_counts()
df_cleansed.smart_location.shape
df_cleansed.smart_location = df_cleansed.smart_location.str.lower()
df_cleansed.isnull().sum()
## Drop listings whose smart_location is written in Chinese characters
## (these appear to be LA-area cities duplicated under non-ASCII names —
## TODO confirm against the ASCII city list). Using isin() replaces the
## original nine-way OR chain with a single membership test.
non_ascii_locations = [
    '蒙特利帕克, ca',
    '因達斯特里, ca',
    '艾尔蒙地, ca',
    '沃尔纳特, ca',
    '哈仙达岗, ca',
    '罗兰高地, ca',
    '罗兰岗, ca',
    '天普市, ca',
    '洛杉矶, ca',
]
df_cleansed.drop(
    df_cleansed[df_cleansed['smart_location'].isin(non_ascii_locations)].index,
    inplace=True,
)
## Strip the ', ca' suffix and surrounding whitespace from smart_location.
df_cleansed.smart_location = df_cleansed.smart_location.str.replace(', ca', ' ', regex=True)
df_cleansed.smart_location = df_cleansed.smart_location.str.strip()
## Removing the spaces from the neighbourhood_cleansed
df_cleansed.neighbourhood_cleansed = df_cleansed.neighbourhood_cleansed.str.strip()
## Convert smart_location and neighbourhood_cleansed columns into lower case
df_cleansed.smart_location = df_cleansed.smart_location.str.lower()
df_cleansed.neighbourhood_cleansed = df_cleansed.neighbourhood_cleansed.str.lower()
df_cleansed.head(2)
Affordability analysis: how do the prices of listings vary by location?
## Affordability fact table: pricing/fee fields plus location coordinates.
affordability_cols = [
    'price', 'security_deposit', 'cleaning_fee', 'guests_included',
    'extra_people', 'minimum_nights', 'maximum_nights', 'amenities',
    'neighbourhood_cleansed', 'state', 'zipcode', 'latitude', 'longitude',
]
df_Fact1 = df_cleansed[affordability_cols].copy()
## Renaming the neighbourhood_cleansed column to location
df_Fact1.rename(columns={'neighbourhood_cleansed': 'location'}, inplace=True)
df_Fact1.info()
df_Fact1.isna().sum()
## We see that we have NaN values in the security_deposit, cleaning_fee, zipcode columns
## So, lets clean these columns
## Lets see what the amenities column holds..
df_Fact1.groupby('amenities')['price'].value_counts().nlargest(3)
## Normalise the raw amenities set-literal: drop the surrounding braces,
## remove double quotes, and lowercase everything.
df_Fact1.amenities = (
    df_Fact1.amenities
    .str.strip('{}')
    .str.replace('"', '')
    .str.lower()
)
## The 12 most frequent individual amenities across all listings.
pd.Series(' '.join(df_Fact1['amenities']).lower().split(',')).value_counts().nlargest(12)
## Taking the most sought-after amenities by customers and creating new
## columns for them. Each entry maps a new column name to
## (substring to search for, display label); the label differs from the
## search substring only for 'bbq' -> 'BBQ'. Dict order matters: it fixes
## the order of the rebuilt 'amenities' summary string below.
amenity_flags = {
    'wifi': ('wifi', 'wifi'),
    'ac': ('air conditioning', 'air conditioning'),
    'pet': ('pet', 'pet'),
    'park': ('park', 'park'),
    'htub': ('hot tub', 'hot tub'),
    'dishwasher': ('dishwasher', 'dishwasher'),
    'bbq': ('bbq', 'BBQ'),
    'pool': ('pool', 'pool'),
    'hr': ('24-hour check-in', '24-hour check-in'),
    'coffee': ('coffee', 'coffee'),
    'heating': ('heating', 'heating'),
    'iron': ('iron', 'iron'),
    'washer': ('washer', 'washer'),
    'dryer': ('dryer', 'dryer'),
    'parking': ('parking', 'parking'),
}
## First pass: each flag column holds the human-readable label when the
## listing's amenities string contains the substring, '' otherwise.
## (regex=False: the substrings contain no regex metacharacters, so this
## matches the original default-regex behaviour while being explicit.)
for col, (pattern, label) in amenity_flags.items():
    df_Fact1[col] = df_Fact1.amenities.str.contains(pattern, regex=False).apply(
        lambda hit, lab=label: lab if hit == True else ''
    )
## Rebuild 'amenities' as a space-joined summary of the matched labels,
## replacing the raw comma-separated string.
df_Fact1.drop('amenities', axis=1, inplace=True)
summary = df_Fact1['wifi']
for col in list(amenity_flags)[1:]:
    summary = summary + ' ' + df_Fact1[col]
df_Fact1['amenities'] = summary
## Second pass: convert each label column into a 0/1 indicator.
for col, (pattern, label) in amenity_flags.items():
    df_Fact1[col] = df_Fact1[col].apply(lambda x, lab=label: 1 if x == lab else 0)
df_Fact1.isna().sum()
def _to_dollars(col):
    """Strip '$' and thousands-separator ',' from a money column and cast to float."""
    return col.str.strip('$').str.replace(',', '', regex=False).astype('float64')

## Clean security_deposit: remove currency formatting, convert to float,
## then fill NaNs with the (rounded) column mean.
df_Fact1.security_deposit = _to_dollars(df_Fact1.security_deposit)
## Understanding the security_deposit column using describe()
df_Fact1.security_deposit.describe()
df_Fact1.security_deposit.fillna(round(df_Fact1.security_deposit.mean(), 2), inplace=True)
## Understanding the cleaning_fee column using describe()
df_Fact1.cleaning_fee.describe()
df_Fact1.cleaning_fee.isna().sum()
## Same cleanup for cleaning_fee: strip formatting, cast, fill NaNs with mean.
df_Fact1.cleaning_fee = _to_dollars(df_Fact1.cleaning_fee)
df_Fact1.cleaning_fee.describe()
df_Fact1.cleaning_fee.fillna(round(df_Fact1.cleaning_fee.mean(), 2), inplace=True)
## Describing the cleaning_fee column after filling the NaN values in the column
df_Fact1.cleaning_fee.describe()
df_Fact1.isna().sum()
df_Fact1.zipcode.isna().sum()
## BUG FIX: `Series.dropna(inplace=True)` on a column cannot remove rows
## from the DataFrame (the original line was a silent no-op); drop
## NaN-zipcode rows at the frame level.
df_Fact1.dropna(subset=['zipcode'], inplace=True)
df_Fact1.shape
## Removing the '$' & ',' symbols from the price column
df_Fact1.price = df_Fact1.price.str.strip('$')
df_Fact1.price = df_Fact1.price.str.replace(',', '', regex=False)
## Converting the price column to Float from str
df_Fact1.price = df_Fact1.price.astype('float64')
## Understanding the price column using describe()
df_Fact1.price.describe()
## Price '0' makes no sense as the listing need to charge for per night stay and there are
## only 9 values
df_Fact1.drop(df_Fact1[df_Fact1.price == 0].index, axis=0, inplace=True)
df_Fact1.price.count()
## Removing the '$' & ',' symbols from the extra_people column
df_Fact1.extra_people = df_Fact1.extra_people.str.strip('$')
df_Fact1.extra_people = df_Fact1.extra_people.str.replace(',', '', regex=False)
## Converting the extra_people column to Float from str
df_Fact1.extra_people = df_Fact1.extra_people.astype('float64')
## Understanding the extra_people column using describe()
df_Fact1.extra_people.describe()
df_Fact1.guests_included.dtype
df_Fact1.minimum_nights.dtype
df_Fact1.maximum_nights.dtype
## Since the numerical columns are already in integer type we do not need to make any change
pd.qcut(df_Fact1.price, 10)
## Bucket listings into Low/Mid/High price bands (bounds taken from the
## qcut deciles above).
## BUG FIX: assign via df.loc[mask, col] — the original chained
## `df.priceType.loc[mask] = value` writes through a temporary Series and is
## not guaranteed to update df_Fact1 (broken under pandas copy-on-write).
df_Fact1['priceType'] = ''
df_Fact1.loc[(df_Fact1['price'] > 9.999) & (df_Fact1['price'] <= 110.0), 'priceType'] = 'Low'
df_Fact1.loc[(df_Fact1['price'] > 110.0) & (df_Fact1['price'] <= 500.0), 'priceType'] = 'Mid'
df_Fact1.loc[(df_Fact1['price'] > 500.0) & (df_Fact1['price'] <= 25000.0), 'priceType'] = 'High'
df_Fact1.head(1)
## Making a lowRange dataframe which has all the values with the priceType as "Low"
lowRnge = pd.DataFrame(df_Fact1[df_Fact1.priceType == 'Low'])
lowRnge.head(2)
## Importing the markercluster module from the folium library
from folium.plugins import MarkerCluster
mc = MarkerCluster()
## Centre the base map on the mean coordinates of the low-priced listings
## (most listings are in LA county); zoom level 10 frames that region.
base_low = folium.Map(
    location=[lowRnge.latitude.mean(), lowRnge.longitude.mean()],
    zoom_start=10,
)
## Add one clustered marker per low-priced listing, with the per-night
## price as the popup text.
for listing in lowRnge.itertuples():
    marker = folium.Marker(
        location=[listing.latitude, listing.longitude],
        popup=listing.price,
    )
    mc.add_child(marker)
base_low.add_child(mc)
## Saving the map in html format in the current directory with the name map_lowprices
base_low.save('map_lowprices.html')
base_low